# Extract text from a PDF
import pdfplumber

# with pdfplumber.open("C:\\Users\\cindata-hrs\\Desktop\\2024MarMonthlyUnifiedSummary.pdf") as pdf:
#     page01 = pdf.pages[0]  # select the page
#     text = page01.extract_text()  # extract the text
#     print(text)

import re

# Sample summary line: two fee labels, each followed by its amount.
text = "Shipping credit refunds 0.97 FBA inventory and inbound services fees -845.85"
# An anchored variant, r'^([-+])?\d+(\.\d+)?', would only match at the very
# start of the string, so it cannot pick up amounts embedded in the line.
# Each hit is a (full amount, sign, fractional part) tuple; groups that did
# not take part in the match come back as ''.
amounts = re.findall(r'(([-+])?\d+(\.\d+)?)', text)
print(amounts)
# Split the line at the two amounts. The dots are escaped so they match a
# literal decimal point instead of any character.
label_parts = re.split(r"0\.97|-845\.85", text)
print(label_parts)


# Optionally signed integer or decimal amount, e.g. "0.97" or "-845.85".
_AMOUNT_RE = re.compile(r'[-+]?\d+(?:\.\d+)?')
# Labels whose rows are dropped from the result.
_SKIPPED_BILL_NAMES = ("Transfers", "Tax")
# Lines starting with one of these prefixes are ignored entirely.
_SKIPPED_LINE_PREFIXES = ("subtotals", "Information")


def _clean_line(line):
    """Remove the words "Debits"/"Credits", all commas and outer whitespace."""
    return line.replace("Debits", "").replace("Credits", "").replace(",", "").strip()


def _parse_bill_line(line):
    """Return the ``{"billName", "amount"}`` dicts found in one cleaned line.

    With a single amount the label is the rest of the line. With several
    amounts each label is the text between the previous amount and its own
    amount; anything after the last amount is ignored.
    """
    matches = list(_AMOUNT_RE.finditer(line))
    if not matches:
        return []

    if len(matches) == 1:
        only = matches[0]
        labels = [line[:only.start()] + line[only.end():]]
    else:
        # Slice by match position instead of re-splitting on the amount
        # strings: an amount such as "+5.00" or "0.97" contains regex
        # metacharacters and would be misread (or rejected) as a pattern.
        labels = []
        previous_end = 0
        for match in matches:
            labels.append(line[previous_end:match.start()])
            previous_end = match.end()

    bills = []
    for label, match in zip(labels, matches):
        bill_name = label.strip()
        if bill_name in _SKIPPED_BILL_NAMES:
            continue
        bills.append({"billName": bill_name, "amount": match.group(0)})
    return bills


def test(file_path):
    """Print the label/amount pairs found on the first page of a PDF.

    Only the text after the last "Debits Credits Debits Credits" header on
    the page is parsed. Each remaining line is cleaned, lines starting with
    "subtotals" or "Information" are skipped, and every amount on a line is
    paired with its label. Rows labelled "Transfers" or "Tax" are left out.

    Args:
        file_path: Path of the PDF file, passed to ``pdfplumber.open``.

    Returns:
        None. Each bill is printed as a ``{"billName": ..., "amount": ...}``
        dict, with the amount kept as the matched string.
    """
    with pdfplumber.open(file_path) as pdf:
        first_page = pdf.pages[0]
        # Guard against a falsy result so a page without extractable text
        # yields no bills instead of failing on .split().
        # NOTE(review): confirm what extract_text() returns for such pages
        # in the installed pdfplumber version.
        page_text = first_page.extract_text() or ""

    # Keep only what follows the last column-header row.
    body = page_text.split("Debits Credits Debits Credits")[-1]

    bill_list = []
    for raw_line in body.split("\n"):
        line = _clean_line(raw_line)
        if not line or line.startswith(_SKIPPED_LINE_PREFIXES):
            continue
        bill_list.extend(_parse_bill_line(line))

    for bill in bill_list:
        print(bill)


# Run the extraction only when this file is executed as a script, so that
# importing the module does not try to open the hard-coded sample PDF.
if __name__ == "__main__":
    test("C:\\Users\\cindata-hrs\\Desktop\\2024MarMonthlyUnifiedSummary.pdf")